# Runs train_flux_concat3img.py through `accelerate launch` with only CUDA
# device index 2 visible to the process.
#
# Memory notes kept from the original author (presumably peak GPU memory per
# training resolution, with "bz" meaning batch size -- TODO confirm):
#   256x256    batch size 1   30GB
#   512x512    batch size 1   35GB
#   3072x1024  batch size 1   39GB
#
# The argument list is accumulated in the positional parameters so each group
# of options can carry its own comment; any arguments passed to this script
# are discarded. The final argv handed to `accelerate` is the same, in the
# same order, as a single backslash-continued command line would produce.

CUDA_VISIBLE_DEVICES=2
export CUDA_VISIBLE_DEVICES

# Options consumed by `accelerate launch` itself, followed by the script to run.
set -- --config_file accelerate_config.yaml train_flux_concat3img.py

# Training data and cache location.
set -- "$@" --jsonl_for_train zhenzhi_train_data.json
set -- "$@" --cache_dir /mnt/nas/shengjie/cache/
set -- "$@" --image_column concat_img

# Training resolution (matches the 3072x1024 entry in the memory notes above).
set -- "$@" --resolution_height 1024 --resolution_width 3072

# Where checkpoints and logs are written.
set -- "$@" --output_dir /mnt/nas/shengjie/zhenzhi_output/
set -- "$@" --logging_dir logs

# Numeric precision and base model weights.
set -- "$@" --mixed_precision bf16
set -- "$@" --pretrained_model_name_or_path /home/shengjie/ckp/FLUX.1-Fill-dev

# Batch size, data loading, run length and checkpoint interval.
set -- "$@" --train_batch_size 1 --dataloader_num_workers 4
set -- "$@" --max_train_steps 20000 --checkpointing_steps 1000

# Presumably the LoRA rank -- TODO confirm against train_flux_concat3img.py.
set -- "$@" --rank 128

# Memory/optimizer switches and learning-rate settings.
set -- "$@" --gradient_checkpointing --use_8bit_adam
set -- "$@" --learning_rate 1e-4 --lr_warmup_steps 0

# Optional switches, currently disabled; uncomment a line to pass the flag.
# set -- "$@" --offload
# set -- "$@" --quant_transformers

# Must remain the last command so the script exits with the launcher's status.
accelerate launch "$@"